1. Data preparation

Read data

SE object created from exploratory analysis.

Gene selection according to Biotype already performed

SE_Bio <- readRDS(paste0(params$SE_Bio))

Convert ENSG annotation to Gene Symbol before Differential Expression Analysis and duplicate gene names removal

Duplicated gene names are dropped and gene IDs are set as rownames.

The number of duplicated GeneName is: 11899

The number of duplicated Ensembl Genes with version is: 0

SE_Bio <- SE_Bio[!duplicated(rowData(SE_Bio)$GeneName), ]
rownames(SE_Bio) <- rowData(SE_Bio)$GeneName
SE_Bio
## class: SummarizedExperiment 
## dim: 24708 36 
## metadata(0):
## assays(1): counts
## rownames(24708): TSPAN6 TNMD ... LNCDAT CDR1
## rowData names(9): Gene EnsGene ... Start End
## colnames(36): OLE_001 OLE_002 ... OLE_048 OLE_049
## colData names(11): SampleNumber SampleID_GU ... HRID lib.size
ScaledCols <- c('darkblue', "purple","white","lightgoldenrod1", 'goldenrod1')

padj_threshold = params$padj_threshold
log2fc_threshold = params$log2fc_threshold

Filtering Thresholds are set to:

  • Log2FC = 1.5
  • FDR = 0.05

Sample selection

Only day50 Cortical Brain Organoids are kept

SE_Bio_d50CbO <- SE_Bio[, colData(SE_Bio)$Timepoint %in% 'd50']
colData(SE_Bio_d50CbO)
## DataFrame with 14 rows and 11 columns
##         SampleNumber SampleID_GU SampleID_OG             SampleName        SampleType    Genotype
##            <integer> <character> <character>            <character>       <character> <character>
## OLE_010            7   D50_3a_WT   D50_3a_WT ORG_CHD2_WT_round3_d.. CorticalOrganoids          WT
## OLE_011            8   D50_3a_HT   D50_3a_HT ORG_CHD2_HT_round3_d.. CorticalOrganoids          HT
## OLE_016           11    D50_3_WT    D50_3_WT ORG_CHD2_WT_round3_d.. CorticalOrganoids          WT
## OLE_017           12    D50_3_HT    D50_3_HT ORG_CHD2_HT_round3_d.. CorticalOrganoids          HT
## OLE_040           27     OLE_040        OL15        15_Evo1_d50_WT3 CorticalOrganoids          WT
## ...              ...         ...         ...                    ...               ...         ...
## OLE_045           32     OLE_045        OL20    20_Evo3_d50_WTC_1-2 CorticalOrganoids          WT
## OLE_046           33     OLE_046        OL21       21_Evo3_d50_A5_1 CorticalOrganoids          AR
## OLE_047           34     OLE_047        OL22      22_Evo4_d50_WTC-A CorticalOrganoids          WT
## OLE_048           35     OLE_048        OL23       23_Evo4_d50_A5-A CorticalOrganoids          AR
## OLE_049           36     OLE_049        OL24      24_Evo4_d50_HET-A CorticalOrganoids          HT
##           Timepoint       Batch    SeqRun        HRID  lib.size
##         <character> <character> <integer> <character> <numeric>
## OLE_010         d50      Round3         1     OLE_010  14450007
## OLE_011         d50      Round3         1     OLE_011  29379109
## OLE_016         d50      Round3         1     OLE_016  15451960
## OLE_017         d50      Round3         1     OLE_017  17477648
## OLE_040         d50        Evo1         2     OLE_040  18977971
## ...             ...         ...       ...         ...       ...
## OLE_045         d50        Evo3         2     OLE_045  18918665
## OLE_046         d50        Evo3         2     OLE_046  12915310
## OLE_047         d50        Evo4         2     OLE_047  25159924
## OLE_048         d50        Evo4         2     OLE_048  17803005
## OLE_049         d50        Evo4         2     OLE_049  20109379

DESeq2

2. Generation of the dds object

  • DDS object is generated from the Count Matrix and Sample Metadata stored in the SE_Bio object
  • Genotype is specified for the design: Heterozygous, Wildtype.
CountMatrix <- assays(SE_Bio_d50CbO)$counts
SampleMeta <- DataFrame(colData(SE_Bio_d50CbO))

all(rownames(SampleMeta) == colnames(CountMatrix))
## [1] TRUE
dds <- DESeqDataSetFromMatrix(countData=assays(SE_Bio_d50CbO)$counts, DataFrame(colData(SE_Bio_d50CbO)), design = ~Batch+Genotype)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula are
## characters, converting to factors

mcols(dds) <- DataFrame(mcols(dds), DataFrame(rowData(SE_Bio_d50CbO)))

dds$Genotype <- factor(dds$Genotype, levels = c("WT", "AR", 'HT')) #no need to specify as column is already ordered, but safer
dds$Genotype <- relevel(dds$Genotype, ref = "WT")


dds$Genotype
##  [1] WT HT WT HT WT AR AR HT HT WT AR WT AR HT
## Levels: WT AR HT

dds
## class: DESeqDataSet 
## dim: 24708 14 
## metadata(1): version
## assays(1): counts
## rownames(24708): TSPAN6 TNMD ... LNCDAT CDR1
## rowData names(9): Gene EnsGene ... Start End
## colnames(14): OLE_010 OLE_011 ... OLE_048 OLE_049
## colData names(11): SampleNumber SampleID_GU ... HRID lib.size

Inspection of genes with zero counts

ZeroPlot <- CountMatrix %>% 
  mutate(row = row_number()) %>%
  pivot_longer(cols = -row, names_to = "col", values_to = "value") %>% 
  filter(value == 0) %>%
  group_by(col) %>%
  summarise(zerocount = n())  %>%
  ggplot(., aes(y=col, x=zerocount)) +
  geom_col(col='black', fill='#76c8c8') +
  coord_flip() + 
  geom_label(aes(label=zerocount)) + 
  labs(title=paste0('Number of genes with zero counts ', '(out of ', nrow(CountMatrix), ' genes)'),
       y='', x='') +
  theme_bw() +
  theme(axis.text = element_text(colour = 'black', size=7),
        axis.text.x = element_text(angle=45, hjust = 0.5, vjust=0.5),
        plot.title = element_text(hjust = 0.5, size = 7))

ZeroPlot

Filtering of the dds object

  • I focus on protein-coding and lncRNA genes only (other biotypes were already removed from the SE_Bio object)
  • I implement a filter on the expression of at least 5 reads in at least 2 samples
keep <- rowSums(counts(dds)>=5) >= 2 #changed from 5 ##changed from 10 and 2 samples from 3

table(keep)
## keep
## FALSE  TRUE 
##  7035 17673
dds <- dds[keep,]

dds
## class: DESeqDataSet 
## dim: 17673 14 
## metadata(1): version
## assays(1): counts
## rownames(17673): TSPAN6 TNMD ... NPBWR1 PDCD6-AHRR
## rowData names(9): Gene EnsGene ... Start End
## colnames(14): OLE_010 OLE_011 ... OLE_048 OLE_049
## colData names(11): SampleNumber SampleID_GU ... HRID lib.size

A dds object containing information about 17673 genes in 14 samples is tested for differential expression.


3. Differential Expression

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
sizeFactors(dds)
##   OLE_010   OLE_011   OLE_016   OLE_017   OLE_040   OLE_041   OLE_042   OLE_043   OLE_044 
## 0.7390996 1.4760943 0.7919811 0.8709701 0.9803307 1.1987549 1.0634332 1.1595120 1.0535215 
##   OLE_045   OLE_046   OLE_047   OLE_048   OLE_049 
## 1.0511518 0.7119825 1.3296681 0.9142719 1.0500573

Inspection of the dds object

Top50 genes heatmap

select <- order(rowMeans(counts(dds,normalized=TRUE)),
                decreasing=TRUE)[1:50]
ntd <- normTransform(dds)
df <- as.data.frame(colData(dds)[,c("Genotype")])
colnames(df) <- 'Genotype'
#df$Run <- c(rep('rep1', 3), rep('rep2', 3))

rownames(df) <- colnames(ntd)

pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
         cluster_cols=TRUE, annotation_col=df, border_color = 'black')

Samples distances

vsd <- vst(dds, blind=FALSE)
sampleDists <- dist(t(assay(vsd)))

sampleDistMatrix <- as.matrix(sampleDists)
#rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
#colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(rev(RColorBrewer::brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors, main = 'Samples Distances', name = 'vst')

Counts Distribution

SF <- data.frame(Sample=names(sizeFactors(dds)), SizeF=sizeFactors(dds))

par(mfrow=c(2,2),cex.lab=0.7)
boxplot(log2(counts(dds)),  col=ScaledCols, cex.axis=0.7, 
        las=1, xlab="log2(counts)", horizontal=TRUE, main="Raw counts")
boxplot(log2(counts(dds, normalized=TRUE)),  col=ScaledCols, cex.axis=0.7, 
        las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts") 
plotDensity(log2(counts(dds)),  col=ScaledCols, 
            xlab="log2(counts)", cex.lab=0.7, panel.first=grid()) 
plotDensity(log2(counts(dds, normalized=TRUE)), col=ScaledCols, 
            xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid()) 

Dispersion estimate

plotDispEsts(dds)


Extract Result of contrast: HT vs WT

res_dds_ht <- results(dds, contrast=c("Genotype","HT","WT"), alpha=0.05, cooksCutoff=0.99)
summary(res_dds_ht)
## 
## out of 17673 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 2887, 16%
## LFC < 0 (down)     : 2714, 15%
## outliers [1]       : 0, 0%
## low counts [2]     : 343, 1.9%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

metadata(res_dds_ht)$filterThreshold
## 1.938776% 
##  2.407767
metadata(res_dds_ht)$alpha
## [1] 0.05

plot(metadata(res_dds_ht)$filterNumRej, 
     type="b", ylab="number of rejections",
     xlab="quantiles of filter", main='Heterozygous',
     cex.lab=0.5, cex.axis=0.5, cex.main=0.5)
lines(metadata(res_dds_ht)$lo.fit, col="red")
abline(v=metadata(res_dds_ht)$filterTheta)

Top results from res table sorted by adjusted Pvalue for HT vs WT

head(res_dds_ht[order(res_dds_ht$padj),])
## log2 fold change (MLE): Genotype HT vs WT 
## Wald test p-value: Genotype HT vs WT 
## DataFrame with 6 rows and 6 columns
##         baseMean log2FoldChange     lfcSE      stat       pvalue         padj
##        <numeric>      <numeric> <numeric> <numeric>    <numeric>    <numeric>
## ZNF117  2766.734       -7.54332 0.1549884  -48.6703  0.00000e+00  0.00000e+00
## ERV3-1  1492.936       -7.53693 0.1759269  -42.8412  0.00000e+00  0.00000e+00
## ZNF562   382.685       -4.36925 0.1900274  -22.9927 5.51293e-117 3.18464e-113
## INPP5F  2050.753        1.52067 0.0686312   22.1572 8.89688e-109 3.85457e-105
## ZNF273   235.836       -6.33985 0.3182608  -19.9203  2.71381e-88  9.40606e-85
## ZNF471   418.291       -4.90022 0.2575796  -19.0241  1.07736e-80  3.11177e-77

  • Genes modulated considering a FDR threshold of 0.1: 6638

  • Genes modulated considering a FDR threshold of 0.05: 5601

  • Genes modulated considering a FDR threshold of 0.05 and FC threshold of 1.5: 2202

  • Genes modulated considering a FDR threshold of 0.05 and FC threshold of 2: 1136


Fold-change shrinkage

Since I am using the constrast option to retrieve the results, I cannot rely on apleglm default algorithm for logFC shrinkage. Since at the moment I am not using the lfcThreshold option, I decide for the ashr algorithm.

resAshr_ht <- lfcShrink(dds, contrast=c("Genotype","HT","WT"), res=res_dds_ht, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
summary(resAshr_ht)
## 
## out of 17673 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 2887, 16%
## LFC < 0 (down)     : 2714, 15%
## outliers [1]       : 0, 0%
## low counts [2]     : 343, 1.9%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Heterozygous

#par(mfrow=c(1,2), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-6,6)
DESeq2::plotMA(res_dds_ht, xlim=xlim, ylim=ylim, main="no LFC shrink")

DESeq2::plotMA(resAshr_ht, xlim=xlim, ylim=ylim, main="LFC shrink with ashr algorithm")


Top results from res table sorted by adjpvalue after LFC shrinkage

head(resAshr_ht[order(resAshr_ht$padj),])
## log2 fold change (MMSE): Genotype HT vs WT 
## Wald test p-value: Genotype HT vs WT 
## DataFrame with 6 rows and 5 columns
##         baseMean log2FoldChange     lfcSE       pvalue         padj
##        <numeric>      <numeric> <numeric>    <numeric>    <numeric>
## ZNF117  2766.734       -7.53678 0.1549485  0.00000e+00  0.00000e+00
## ERV3-1  1492.936       -7.52850 0.1758691  0.00000e+00  0.00000e+00
## ZNF562   382.685       -4.35450 0.1904841 5.51293e-117 3.18464e-113
## INPP5F  2050.753        1.50850 0.0685866 8.89688e-109 3.85457e-105
## ZNF273   235.836       -6.31315 0.3184744  2.71381e-88  9.40606e-85
## ZNF471   418.291       -4.87726 0.2584989  1.07736e-80  3.11177e-77

  • Genes modulated considering a FDR threshold of 0.1: 6638

  • Genes modulated considering a FDR threshold of 0.05: 5601

  • Genes modulated considering a FDR threshold of 0.05 and FC threshold of 1.5: 1744

  • Genes modulated considering a FDR threshold of 0.05 and FC threshold of 2: 708


Log Fold change shrinkage doesn’t change the results significantly, thus I decided to keep the standard DESeq2 workflow for testing differential gene expression.


Extract DEGs

  • Here I extract the DEGs passing adjusted Pvalue and logFC thresholds, for HT vs WT contrast (deseqDEGsHT)
deseqDEGsHT <- dplyr::filter(data.frame(res_dds_ht), padj < padj_threshold, abs(log2FoldChange) > log2(log2fc_threshold))

deseqDEGsHTashr <- dplyr::filter(data.frame(resAshr_ht), padj < padj_threshold, abs(log2FoldChange) > log2(log2fc_threshold))

Results Visualization

SE_deseq2 <- as(dds, "RangedSummarizedExperiment")
assays(SE_deseq2)$vst <- assay(vst(dds, blind=TRUE))
FdrCeil = 1e-10
logFcCeil = 7.5

Plotting Ceilings are set to:

  • Log2FC 7.5
  • FDR 10^{-10}

Volcano

Heterozygous

#rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = 0.05, LabellingCutoff = 0.01, FCCutoff = 2, main = 'HT vs WT')

dplyr::rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = padj_threshold, LabellingCutoff = 0.01, FCCutoff = log2fc_threshold, main = 'HT vs WT (day50 CO)') + labs(y='Log FoldChange') + xlim(0, -log10(FdrCeil)) + ylim(-logFcCeil, logFcCeil)

Heterozygous LFC shrink

#rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = 0.05, LabellingCutoff = 0.01, FCCutoff = 2, main = 'HT vs WT')

dplyr::rename(as.data.frame(resAshr_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = padj_threshold, LabellingCutoff = 0.01, FCCutoff = log2fc_threshold, main = 'HT vs WT (day50 CO)') + labs(y='Log FoldChange') + xlim(0, -log10(FdrCeil)) + ylim(-logFcCeil, logFcCeil)


9. Savings

DEAList <- list()

DEAList <- list(dds = dds,  #same for both
                HT = list(res = res_dds_ht, 
                          DEGs = deseqDEGsHT,
                          resAshr =resAshr_ht,
                          DEGsAshr = deseqDEGsHTashr))
# RDS
saveRDS(DEAList, paste0(SavingFolder, '/day50CbO.', 'DEAList_HT.rds')) 

saveRDS(SE_deseq2, paste0(SavingFolder, '/day50CbO.', 'SE_deseq2_HT.rds')) 

saveRDS(res_dds_ht, paste0(SavingFolder, '/day50CbO.', 'deseqHTvsWT.rds')) 

SessionInfo <- sessionInfo()
Date <- date()
save.image(paste0(SavingFolder, '/day50CbO.', 'DEAnalysisWorkspace_HT.RData'))

last update on: Fri Jan 10 19:02:00 2025